#!/usr/bin/perl


###	This script is part of the Zeitcrawler v1.3 (http://code.google.com/p/zeitcrawler/).
###	Copyright (C) Adrien Barbaresi 2011-2013.
###	This is free software, released under the GNU GPL v3 license (http://www.gnu.org/licenses/gpl.html).

## WORK IN PROGRESS! This is not a mature script.
## Please check what this script does before executing it.

# Splits the crawl archive ("ZEIT_flatfile") into multiple XML files (metadata and text) and stores them in a folder named "docs".


use strict;
use warnings;
use utf8;


# Main flow: read the crawl archive line by line, collect the lines of one
# record (a record ends with a line consisting of "-----"), and for every
# record that is kept write
#   - its metadata as a <doc ...> tag to docs/meta-N, and
#   - its text, followed by a "%%%%%" separator line, to docs/text.
# The number of documents written is printed at the end.

# Stop after this many documents have been written.
my $max_docs = 1000;

my $text = '';                       # lines of the record currently being read
my $docnr = 0;                       # documents written so far
my (%titles, %excerpts);             # titles / excerpts of documents already written
my ($title, $excerpt, $metatext);
my $duplicate = 0;                   # +1 for a known title, +1 for a known excerpt


# Assumes the archive is UTF-8 encoded -- TODO confirm. The lines have to be
# decoded for the non-ASCII quote characters matched in xmlize() to be found
# at all; the same layer on the output handles encodes the text again.
my $output = "docs/text";
open (my $text_fh, ">:encoding(UTF-8)", $output) or die "Can't open $output: $!";


my $input = "ZEIT_flatfile";
open (my $in_fh, "<:encoding(UTF-8)", $input) or die "Can't open $input: $!";

LINE: while (my $line = <$in_fh>) {
	# Lines starting with "<li" or "//" and lines containing "]]>" are dropped.
	next LINE if ($line =~ m/^<li/);
	next LINE if ($line =~ m/^\/\//);
	next LINE if ($line =~ m/]]>/);

	# NOTE(review): the "url: " line is not passed through xmlize(), so a raw
	# '&' or '"' in a URL reaches the attribute unescaped -- confirm whether
	# that is intended.
	if ($line =~ m/^Titel: /) {
		($title = $line) =~ s/^Titel: //;
		$duplicate++ if ( (length($title) > 8 ) && (exists $titles{$title}) );
		$line = xmlize($line);
	}
	elsif ($line =~ m/^Excerpt: /) {
		($excerpt = $line) =~ s/^Excerpt: //;
		$duplicate++ if ( (length($excerpt) > 10 ) && (exists $excerpts{$excerpt}) );
		$line = xmlize($line);
	}
	elsif ($line =~ m/^Autor: /) {
		$line = xmlize($line);
	}
	elsif ($line =~ m/^Datum: /) {
		# Anything that does not start with a DD.MM.YYYY date is blanked out.
		if ($line !~ m/^Datum: [0-9]{2}\.[0-9]{2}\.[0-9]{4}/) {
			$line =~ s/Datum: .+?$/Datum: /;
		}
		$line = xmlize($line);
	}
	$text .= $line;

	# Everything below only runs once the end-of-record marker has been read.
	next LINE unless ($line =~ m/^-----$/);

	# A record is skipped only when both its title and its excerpt are known.
	# The captures are used directly instead of splitting on an inserted
	# marker, so a "####" inside a document can no longer cut the text short.
	if ( ($duplicate != 2)
		&& ($text =~ m/\A(.*?)Titel: (.*?)\nExcerpt: (.*?)\nAutor: (.*?)\nDatum: (.*?)\nurl: (.*?)\n(.+?)-----(.*)\z/s) ) {
		my ($before, $titel, $untertitel, $autor, $datum, $url, $body, $after)
			= ($1, $2, $3, $4, $5, $6, $7, $8);

		# Whatever preceded the "Titel: " line stays in front of the tag.
		$metatext = $before
			. '<doc titel="' . $titel
			. '" untertitel="' . $untertitel
			. '" autor="' . $autor
			. '" datum="' . $datum
			. '" url="' . $url . '">' . "\n";
		# Trim spaces directly inside the attribute quotes.
		$metatext =~ s/=" /="/g;
		$metatext =~ s/ "/"/g;

		my $doctext = $body . $after;
		if (length($doctext) > 10) {
			$doctext = normalize($doctext);
			$docnr++;

			my $meta = "docs/meta-" . $docnr;
			open (my $meta_fh, ">:encoding(UTF-8)", $meta) or die "Can't open $meta: $!";
			print {$meta_fh} $metatext . "\n";
			close($meta_fh) or die "Can't close $meta: $!";

			print {$text_fh} $doctext;
			print {$text_fh} "%%%%%\n";

			# Remember what has been written for the duplicate check above.
			$titles{$title}++ if defined $title;
			$excerpts{$excerpt}++ if defined $excerpt;
		}
	}

	# Start the next record with a clean slate.
	$text = '';
	$duplicate = 0;
	($title, $excerpt) = (undef, undef);

	last LINE if ($docnr == $max_docs);
}
close($in_fh);
close($text_fh) or die "Can't close $output: $!";

print "$docnr\n";

# Undo the ampersand escaping in a document text.
# Takes the text as its only argument and returns a copy in which every
# "&amp;" has been turned back into "&"; the argument itself is not modified.
sub normalize {
	my ($escaped) = @_;
	#$string =~ s/[„”„“”‚’]/"/g;
	(my $plain = $escaped) =~ s/&amp;/&/g;
	return $plain;
}

# Prepare one metadata line for use inside an XML attribute value.
#
# Takes the line as its only argument and returns a cleaned copy in which
#   - everything between a "<" and the next ">" has been removed,
#   - straight and typographic double quotes are replaced by &quot;,
#   - "&" is replaced by &amp; (a &quot; entity is left as &quot;),
#   - apostrophes and the &#8216; entity are replaced by &apos;,
#   - any remaining ">" and "<" are replaced by &gt; and &lt;.
sub xmlize {
	my $string = shift;
	$string =~ s/<.+?>//g;
	# Has to happen before the ampersands are escaped: afterwards the entity
	# reads "&amp;#8216;" and can no longer be recognised.
	$string =~ s/&#8216;/'/g;
	$string =~ s/["„“”‚’]/&quot;/g;
	$string =~ s/&/&amp;/g;
	$string =~ s/'/&apos;/g;
	$string =~ s/>/&gt;/g;
	$string =~ s/</&lt;/g;
	# The ampersand pass also rewrote every &quot; (the ones inserted above
	# as well as those already present in the input); restore them.
	$string =~ s/&amp;quot;/&quot;/g;
	return $string;
}
